In [5]:
# General imports used throughout the notebook
import pandas as pd
import numpy as np
# Plotting imports (needed by the d3Plot function below)
import matplotlib
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D  # registers the '3d' projection used by d3Plot
# For reading the CSV
from pandas import read_csv
# Imports for the classification
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_validate
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn import svm
In [6]:
# Specifying engine='python' because the C engine cannot handle this 'sep'
# @params: data_num
# @output: dataset (DataFrame)
def read(data_num):
    names_attributes = ['sequentialNumber', 'xAcceleration', 'yAcceleration', 'zAcceleration', 'label']
    dataset = pd.read_csv('data/%d.csv' % data_num, sep=',', header=None, engine='python', names=names_attributes)
    # Comment in for printing out the data and its size
    # print(dataset)
    # print(np.size(dataset, 0))
    return dataset
In [12]:
# The DataFrame read here is later transformed into a NumPy array, since the downstream
# functions (count_labels, d3Plot, grouping) operate on an array.
# Specifying engine='python' because the C engine cannot handle this 'sep'
# @params: data_num
# @output: dataset (DataFrame)
def read(data_num):
    names_attributes = ['sequentialNumber', 'xAcceleration', 'yAcceleration', 'zAcceleration', 'label']
    dataset = read_csv('data/%d.csv' % data_num, sep=',', header=None, engine='python', names=names_attributes)
    # Comment in for printing out the data and its size
    # print(dataset)
    # print(np.size(dataset, 0))
    return dataset
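# A minimal sketch of the DataFrame-to-NumPy conversion mentioned above, shown on a tiny
# synthetic frame. In this notebook the actual conversion (together with the removal of
# invalid rows) is handled by zeroDet, which is defined in another cell; its exact behaviour
# is assumed here, so this is only an illustration.
demo_df = pd.DataFrame([[0, 0.1, -0.2, 9.8, 1], [1, 0.2, -0.1, 9.7, 1]],
                       columns=['sequentialNumber', 'xAcceleration', 'yAcceleration', 'zAcceleration', 'label'])
demo_array = demo_df.values   # NumPy array of shape (2, 5): [seq, x, y, z, label]
# print(demo_array)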
# This function checks how balanced the data is
# @Input: data_array with labels
# @Output: dict mapping each label to its count
def count_labels(data_array):
    label_data = data_array[:, [4]]
    unique, counts = np.unique(label_data, return_counts=True)
    ret = dict(zip(unique, counts))
    return ret
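# Quick illustrative check of count_labels on synthetic rows (the numbers are made up):
# three samples with label 1 and one with label 2 should give {1.0: 3, 2.0: 1}.
demo_rows = np.array([[0, 0.1, 0.2, 9.8, 1],
                      [1, 0.1, 0.2, 9.8, 1],
                      [2, 0.1, 0.2, 9.8, 1],
                      [3, 0.5, 0.1, 9.6, 2]])
# Comment in to run:
# print(count_labels(demo_rows))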
# 3D plot
# A simple function to plot the points of the original dataset in a 3D scatter plot.
# The different colours correspond to the different activity labels.
# @input: dataset as a matrix/array
# @output: 3D scatter plot of the points
def d3Plot(dataset):
    def column(matrix, i):
        return [row[i] for row in matrix]
    ax = plt.axes(projection='3d')
    # Data for three-dimensional scattered points
    zdata = column(dataset, 3)
    ydata = column(dataset, 2)
    xdata = column(dataset, 1)
    label = column(dataset, 4)
    colors = ['orange', 'green', 'blue', 'purple', 'yellow', 'black', 'orange', 'white']
    ax.scatter3D(xdata, ydata, zdata, c=label, cmap=matplotlib.colors.ListedColormap(colors))
    return plt.show()
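# d3Plot is meant to be called on the cleaned NumPy array (layout [seq, x, y, z, label]),
# e.g. once the data has been read and cleaned later in the notebook:
# d3Plot(cleaned_data)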
In [17]:
# Feature Extraction
# For feature extraction we use a technique called window overlapping (Pierluigi Casale, Oriol Pujol, and Petia Radeva.
# Human activity recognition from accelerometer data using a wearable device. Pattern Recognition and Image Analysis,
# pages 289–296, 2011). Consecutive time windows overlap by 50%. As time window we use 1 second,
# which corresponds to 52 samples (52 Hz sampling frequency).
# Then we start with the sequencing. Slicing needs to be done as follows:
# - two different activities must never be grouped into one window (this would falsify the mean value)
# - therefore only samples with the same label are grouped into one window
# @params: array_data, the array containing the cleaned data
# @output: data_list, a list of NumPy arrays with the respective windows
def grouping(array_data):
    start = int(0)
    end = int(52)
    data_list = []
    length = np.size(array_data, 0)
    while start < length - 52:
        if array_data[start][4] != array_data[end - 1][4]:       # this check is necessary to ensure that no two
            while array_data[start][4] != array_data[end - 1][4]:  # different labels end up in the same window
                end = end - 1
            newArray = array_data[slice(start, end)]
            start = end
            end = end + 52
        else:
            newArray = array_data[slice(start, end)]
            start = start + 26
            end = end + 26
        data_list.append(newArray)
        if end - 52 > length - 1:
            end = length - 1
    # Comment in to show the size and length of the data_list
    # print(np.size(data_list))
    # print(len(data_list))
    return data_list
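# Illustrative check of the 50% window overlap on synthetic data: 130 identically labelled
# samples should yield three windows covering rows [0:52], [26:78] and [52:104] (52 rows each).
demo_signal = np.zeros((130, 5))
demo_signal[:, 4] = 1   # a single activity label for all samples
# Comment in to run:
# demo_windows = grouping(demo_signal)
# print(len(demo_windows), [len(w) for w in demo_windows])   # expected: 3 [52, 52, 52]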
# This is an additional function which could be called to print a data list to a text file (e.g. to examine it)
# Comment in for printing the data to a text file
# def sysout_to_text(dataList):
#     file = open("tempFile", "w")
#     for item in dataList:
#         file.write("%s\n" % item)
#     file.close()
# Now we compute the mean value and standard deviation of all windows
# @params: grouped data_list containing the window arrays
# @output: feature matrix (mean x, y, z and standard deviation of x, y, z per window) and target array (one label per window)
def extract_features(data_list):
    total_average_values = []
    total_label = []
    for row in data_list:
        acceleration = np.nanmean(row, 0)
        standard_deviation = np.std(row, 0)
        temp_features = [acceleration[1], acceleration[2], acceleration[3],
                         standard_deviation[1], standard_deviation[2], standard_deviation[3]]
        label_array = [row[0][4]]
        total_average_values.append(temp_features)
        total_label.append(label_array)
    feature = np.vstack(total_average_values)
    target = np.vstack(total_label)
    # Comment in to print out the lists
    # print(feature)
    # print(target)
    return feature, target
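# Small illustrative run of extract_features on two synthetic windows (numbers are made up):
# each window is an array of rows [seq, x, y, z, label]; the result should be a (2, 6)
# feature matrix (mean x/y/z followed by std x/y/z) and a (2, 1) target vector.
demo_window_list = [np.array([[0, 0.1, 0.2, 9.8, 1], [1, 0.3, 0.0, 9.6, 1]]),
                    np.array([[2, 1.0, 1.1, 8.0, 4], [3, 1.2, 0.9, 8.2, 4]])]
# Comment in to run:
# demo_feature, demo_target = extract_features(demo_window_list)
# print(demo_feature.shape, demo_target.shape)   # expected: (2, 6) (2, 1)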
In [22]:
# Function to classify without cross-validation:
# here we predict with Random Forest and SVM after a simple split of the dataset
# into a training and a test set, and we report the following metrics:
# 1) F1 score
# 2) Accuracy
# 3) Confusion matrix
def classify(x_features, y_features):
    X_train, X_test, y_train, y_test = train_test_split(x_features, y_features.ravel(), test_size=0.2, random_state=0)
    # Comment in to inspect the shapes:
    # print(X_train.shape, y_train.shape)
    # print(X_test.shape, y_test.shape)
    forest = RandomForestClassifier(n_estimators=100, random_state=0)
    clf = svm.SVC(kernel='linear', C=1)
    model = forest.fit(X_train, y_train)
    modelSv = clf.fit(X_train, y_train)
    predicted_labels = model.predict(X_test)
    predicted_labelsSv = modelSv.predict(X_test)
    # Compute the F1 score, also known as balanced F-score or F-measure.
    # The F1 score can be interpreted as a weighted average of the precision and recall,
    # where an F1 score reaches its best value at 1 and worst score at 0.
    # The relative contribution of precision and recall to the F1 score is equal.
    # The formula for the F1 score is:
    #     F1 = 2 * (precision * recall) / (precision + recall)
    # With average='macro', the multi-class score is the unweighted mean of the per-class F1 scores.
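    # For example, with illustrative numbers precision = 0.8 and recall = 0.6:
    #     F1 = 2 * (0.8 * 0.6) / (0.8 + 0.6) = 0.96 / 1.4 ≈ 0.686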
print(" F1 score Random forest: %f" % f1_score(y_test, predicted_labels, average='macro'))
print(" F1 score precision with SV: %f" % f1_score(y_test, predicted_labels, average='macro'))
print("Accuracy Random forest: %f" % metrics.accuracy_score(y_test, predicted_labels))
print("Accuracy with SV: %f" % metrics.accuracy_score(y_test, predicted_labelsSv))
#By definition a confusion matrix C is such that C_{i, j}
# is equal to the number of observations known to be in group i but predicted to be in group j.
print("Confusion Matrix Random forest: ")
print(confusion_matrix(y_test, predicted_labels, labels=[1, 2, 3, 4, 5, 6, 7]))
print("Confusion Matrix with SVM: ")
print(confusion_matrix(y_test, predicted_labelsSv, labels=[1, 2, 3, 4, 5, 6, 7]))
return
# Function to classify with cross-validation, reporting:
# 1) Accuracy
# 2) F1 score
def CrossValidation(x_features, y_features, kfold):
    scoring = ['accuracy', 'f1_micro']
    forest = RandomForestClassifier(n_estimators=100, random_state=0)
    clf = svm.SVC(kernel='linear', C=1)
    scoresSv = cross_validate(clf, x_features, y_features.ravel(), scoring=scoring, cv=kfold, return_train_score=False)
    scores = cross_validate(forest, x_features, y_features.ravel(), scoring=scoring, cv=kfold, return_train_score=False)
    print("Accuracy Random Forest: %0.2f (+/- %0.2f)" % (scores['test_accuracy'].mean(), scores['test_accuracy'].std() * 2))
    print("F1 Score Random Forest: %0.2f (+/- %0.2f)" % (scores['test_f1_micro'].mean(), scores['test_f1_micro'].std() * 2))
    print("Accuracy SVM: %0.2f (+/- %0.2f)" % (scoresSv['test_accuracy'].mean(), scoresSv['test_accuracy'].std() * 2))
    print("F1 Score SVM: %0.2f (+/- %0.2f)" % (scoresSv['test_f1_micro'].mean(), scoresSv['test_f1_micro'].std() * 2))
print("Scores for the test folds (Random Forest)", scores['test_accuracy'])
print("Scores for the test folds (Support Vector Machine)", scoresSv['test_accuracy'])
return
In [23]:
# Importing data
dataframe = read(1)
# print(dataframe)
# Delete incorrect data
cleaned_data = zeroDet(dataframe, 0)
# Check how balanced the data is
counts = count_labels(cleaned_data)
print("Instances of every label, from one to seven:", counts)
# Feature extraction
grouped_data = grouping(cleaned_data)
x_features, y_features = extract_features(grouped_data)
# We call the functions to classify our data: first with a simple split of the dataset
# into a training and a test set, then using k-fold cross-validation, each time with
# two different models: 1) Random Forest classifier, 2) Support Vector Machine
classify(x_features, y_features)
CrossValidation(x_features, y_features, 5)
In [ ]: